{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 13. Machine learning techniques"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random as rd\n",
    "rd.seed(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 13.1 Loading and exploring the dataset\n",
    "\n",
    "First, we use pandas to load the dataset from a csv file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>886</th>\n",
       "      <td>887</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>Montvila, Rev. Juozas</td>\n",
       "      <td>male</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>211536</td>\n",
       "      <td>13.0000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>887</th>\n",
       "      <td>888</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Graham, Miss. Margaret Edith</td>\n",
       "      <td>female</td>\n",
       "      <td>19.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>112053</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>B42</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>888</th>\n",
       "      <td>889</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>W./C. 6607</td>\n",
       "      <td>23.4500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>889</th>\n",
       "      <td>890</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Behr, Mr. Karl Howell</td>\n",
       "      <td>male</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>111369</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>C148</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>891</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Dooley, Mr. Patrick</td>\n",
       "      <td>male</td>\n",
       "      <td>32.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>370376</td>\n",
       "      <td>7.7500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Q</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>891 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass  \\\n",
       "0              1         0       3   \n",
       "1              2         1       1   \n",
       "2              3         1       3   \n",
       "3              4         1       1   \n",
       "4              5         0       3   \n",
       "..           ...       ...     ...   \n",
       "886          887         0       2   \n",
       "887          888         1       1   \n",
       "888          889         0       3   \n",
       "889          890         1       1   \n",
       "890          891         0       3   \n",
       "\n",
       "                                                  Name     Sex   Age  SibSp  \\\n",
       "0                              Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                               Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                             Allen, Mr. William Henry    male  35.0      0   \n",
       "..                                                 ...     ...   ...    ...   \n",
       "886                              Montvila, Rev. Juozas    male  27.0      0   \n",
       "887                       Graham, Miss. Margaret Edith  female  19.0      0   \n",
       "888           Johnston, Miss. Catherine Helen \"Carrie\"  female   NaN      1   \n",
       "889                              Behr, Mr. Karl Howell    male  26.0      0   \n",
       "890                                Dooley, Mr. Patrick    male  32.0      0   \n",
       "\n",
       "     Parch            Ticket     Fare Cabin Embarked  \n",
       "0        0         A/5 21171   7.2500   NaN        S  \n",
       "1        0          PC 17599  71.2833   C85        C  \n",
       "2        0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3        0            113803  53.1000  C123        S  \n",
       "4        0            373450   8.0500   NaN        S  \n",
       "..     ...               ...      ...   ...      ...  \n",
       "886      0            211536  13.0000   NaN        S  \n",
       "887      0            112053  30.0000   B42        S  \n",
       "888      2        W./C. 6607  23.4500   NaN        S  \n",
       "889      0            111369  30.0000  C148        C  \n",
       "890      0            370376   7.7500   NaN        Q  \n",
       "\n",
       "[891 rows x 12 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data = pandas.read_csv('./titanic.csv')\n",
    "raw_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, we can explore the dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The dataset has 891 rows\n"
     ]
    }
   ],
   "source": [
    "# Examining the length of the dataset\n",
    "print(\"The dataset has\", len(raw_data), \"rows\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Columns (features of the dataset)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',\n",
       "       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Examining the columns in the dataset\n",
    "print(\"Columns (features of the dataset)\")\n",
    "raw_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Labels\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0      0\n",
       "1      1\n",
       "2      1\n",
       "3      1\n",
       "4      0\n",
       "      ..\n",
       "886    0\n",
       "887    1\n",
       "888    0\n",
       "889    1\n",
       "890    0\n",
       "Name: Survived, Length: 891, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Examining the labels\n",
    "print(\"Labels\")\n",
    "raw_data[\"Survived\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "342 passengers survived out of 891\n"
     ]
    }
   ],
   "source": [
    "# Examining how many passengers survived\n",
    "print(sum(raw_data['Survived']),'passengers survived out of',len(raw_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>22.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>38.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>26.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>35.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>35.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>886</th>\n",
       "      <td>Montvila, Rev. Juozas</td>\n",
       "      <td>27.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>887</th>\n",
       "      <td>Graham, Miss. Margaret Edith</td>\n",
       "      <td>19.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>888</th>\n",
       "      <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>889</th>\n",
       "      <td>Behr, Mr. Karl Howell</td>\n",
       "      <td>26.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>Dooley, Mr. Patrick</td>\n",
       "      <td>32.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>891 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  Name   Age\n",
       "0                              Braund, Mr. Owen Harris  22.0\n",
       "1    Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0\n",
       "2                               Heikkinen, Miss. Laina  26.0\n",
       "3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0\n",
       "4                             Allen, Mr. William Henry  35.0\n",
       "..                                                 ...   ...\n",
       "886                              Montvila, Rev. Juozas  27.0\n",
       "887                       Graham, Miss. Margaret Edith  19.0\n",
       "888           Johnston, Miss. Catherine Helen \"Carrie\"   NaN\n",
       "889                              Behr, Mr. Karl Howell  26.0\n",
       "890                                Dooley, Mr. Patrick  32.0\n",
       "\n",
       "[891 rows x 2 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One can look at several columns together\n",
    "raw_data[[\"Name\", \"Age\"]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 13.2. Cleaning up the data\n",
    "\n",
    "Now, let's look at how many columns have missing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PassengerId      0\n",
       "Survived         0\n",
       "Pclass           0\n",
       "Name             0\n",
       "Sex              0\n",
       "Age            177\n",
       "SibSp            0\n",
       "Parch            0\n",
       "Ticket           0\n",
       "Fare             0\n",
       "Cabin          687\n",
       "Embarked         2\n",
       "dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data.isna().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The Cabin column is missing too many values to be useful. Let's drop it altogether."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       NaN\n",
       "1       C85\n",
       "2       NaN\n",
       "3      C123\n",
       "4       NaN\n",
       "       ... \n",
       "886     NaN\n",
       "887     B42\n",
       "888     NaN\n",
       "889    C148\n",
       "890     NaN\n",
       "Name: Cabin, Length: 891, dtype: object"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data['Cabin']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The Cabin column is missing 687 values out of 891\n"
     ]
    }
   ],
   "source": [
    "print(\"The Cabin column is missing\", sum(raw_data['Cabin'].isna()), \"values out of\",len(raw_data['Cabin']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "clean_data = raw_data.drop('Cabin', axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Embarked  \n",
       "0      0         A/5 21171   7.2500        S  \n",
       "1      0          PC 17599  71.2833        C  \n",
       "2      0  STON/O2. 3101282   7.9250        S  \n",
       "3      0            113803  53.1000        S  \n",
       "4      0            373450   8.0500        S  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clean_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Other columns such as Age or Embarked are missing some values, but they can still be useful.\n",
    "\n",
    "For the age column, let's fill in the missing values with the median of all ages.\n",
    "\n",
    "For the Embarked column, let's make a new category called 'U', for Unknown port of embarkment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      22.0\n",
       "1      38.0\n",
       "2      26.0\n",
       "3      35.0\n",
       "4      35.0\n",
       "       ... \n",
       "886    27.0\n",
       "887    19.0\n",
       "888     NaN\n",
       "889    26.0\n",
       "890    32.0\n",
       "Name: Age, Length: 891, dtype: float64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clean_data['Age']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "28.0"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "median_age = raw_data[\"Age\"].median()\n",
    "median_age"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "clean_data[\"Age\"] = clean_data[\"Age\"].fillna(median_age)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "clean_data[\"Embarked\"] = clean_data[\"Embarked\"].fillna('U')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PassengerId    0\n",
       "Survived       0\n",
       "Pclass         0\n",
       "Name           0\n",
       "Sex            0\n",
       "Age            0\n",
       "SibSp          0\n",
       "Parch          0\n",
       "Ticket         0\n",
       "Fare           0\n",
       "Embarked       0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clean_data.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Embarked  \n",
       "0      0         A/5 21171   7.2500        S  \n",
       "1      0          PC 17599  71.2833        C  \n",
       "2      0  STON/O2. 3101282   7.9250        S  \n",
       "3      0            113803  53.1000        S  \n",
       "4      0            373450   8.0500        S  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clean_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 12.2.3 Saving our data for the future"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "clean_data.to_csv('./clean_titanic_data.csv', index=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 12.3 Manipulating the features\n",
    "\n",
    "- One-hot encoding\n",
    "- Binning\n",
    "- Feature selection\n",
    "\n",
    "### 13.3.1 One-hot encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Embarked  \n",
       "0      0         A/5 21171   7.2500        S  \n",
       "1      0          PC 17599  71.2833        C  \n",
       "2      0  STON/O2. 3101282   7.9250        S  \n",
       "3      0            113803  53.1000        S  \n",
       "4      0            373450   8.0500        S  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprocessed_data = pandas.read_csv('clean_titanic_data.csv')\n",
    "preprocessed_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "One-hot encoding the gender feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     Sex_female  Sex_male\n",
      "0             0         1\n",
      "1             1         0\n",
      "2             1         0\n",
      "3             1         0\n",
      "4             0         1\n",
      "..          ...       ...\n",
      "886           0         1\n",
      "887           1         0\n",
      "888           1         0\n",
      "889           0         1\n",
      "890           0         1\n",
      "\n",
      "[891 rows x 2 columns]\n",
      "     Embarked_C  Embarked_Q  Embarked_S  Embarked_U\n",
      "0             0           0           1           0\n",
      "1             1           0           0           0\n",
      "2             0           0           1           0\n",
      "3             0           0           1           0\n",
      "4             0           0           1           0\n",
      "..          ...         ...         ...         ...\n",
      "886           0           0           1           0\n",
      "887           0           0           1           0\n",
      "888           0           0           1           0\n",
      "889           1           0           0           0\n",
      "890           0           1           0           0\n",
      "\n",
      "[891 rows x 4 columns]\n"
     ]
    }
   ],
   "source": [
    "gender_columns = pandas.get_dummies(preprocessed_data['Sex'], prefix='Sex')\n",
    "print(gender_columns)\n",
    "embarked_columns = pandas.get_dummies(preprocessed_data[\"Embarked\"], prefix=\"Embarked\")\n",
    "print(embarked_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "preprocessed_data = pandas.concat([preprocessed_data, gender_columns], axis=1)\n",
    "preprocessed_data = pandas.concat([preprocessed_data, embarked_columns], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "preprocessed_data = preprocessed_data.drop(['Sex', 'Embarked'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>Embarked_U</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name   Age  SibSp  Parch  \\\n",
       "0                            Braund, Mr. Owen Harris  22.0      1      0   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0      1      0   \n",
       "2                             Heikkinen, Miss. Laina  26.0      0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0      1      0   \n",
       "4                           Allen, Mr. William Henry  35.0      0      0   \n",
       "\n",
       "             Ticket     Fare  Sex_female  Sex_male  Embarked_C  Embarked_Q  \\\n",
       "0         A/5 21171   7.2500           0         1           0           0   \n",
       "1          PC 17599  71.2833           1         0           1           0   \n",
       "2  STON/O2. 3101282   7.9250           1         0           0           0   \n",
       "3            113803  53.1000           1         0           0           0   \n",
       "4            373450   8.0500           0         1           0           0   \n",
       "\n",
       "   Embarked_S  Embarked_U  \n",
       "0           1           0  \n",
       "1           0           0  \n",
       "2           1           0  \n",
       "3           1           0  \n",
       "4           1           0  "
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprocessed_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### A rule of thumb for when to one-hot encode or not"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "In first class 62.96296296296296 % of passengers survived\n",
      "In second class 47.28260869565217 % of passengers survived\n",
      "In third class 24.236252545824847 % of passengers survived\n"
     ]
    }
   ],
   "source": [
    "class_survived = preprocessed_data[['Pclass', 'Survived']]\n",
    "\n",
    "first_class = class_survived[class_survived['Pclass'] == 1]\n",
    "second_class = class_survived[class_survived['Pclass'] == 2]\n",
    "third_class = class_survived[class_survived['Pclass'] == 3]\n",
    "\n",
    "print(\"In first class\", sum(first_class['Survived'])/len(first_class)*100, \"% of passengers survived\")\n",
    "print(\"In second class\", sum(second_class['Survived'])/len(second_class)*100, \"% of passengers survived\")\n",
    "print(\"In third class\", sum(third_class['Survived'])/len(third_class)*100, \"% of passengers survived\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "categorized_pclass_columns = pandas.get_dummies(preprocessed_data['Pclass'], prefix='Pclass')\n",
    "preprocessed_data = pandas.concat([preprocessed_data, categorized_pclass_columns], axis=1)\n",
    "preprocessed_data = preprocessed_data.drop(['Pclass'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Name</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Pclass_C</th>\n",
       "      <th>Pclass_Q</th>\n",
       "      <th>Pclass_S</th>\n",
       "      <th>Pclass_U</th>\n",
       "      <th>Pclass_1</th>\n",
       "      <th>Pclass_2</th>\n",
       "      <th>Pclass_3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived                                               Name  \\\n",
       "0            1         0                            Braund, Mr. Owen Harris   \n",
       "1            2         1  Cumings, Mrs. John Bradley (Florence Briggs Th...   \n",
       "2            3         1                             Heikkinen, Miss. Laina   \n",
       "3            4         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   \n",
       "4            5         0                           Allen, Mr. William Henry   \n",
       "\n",
       "    Age  SibSp  Parch            Ticket     Fare  Sex_female  Sex_male  \\\n",
       "0  22.0      1      0         A/5 21171   7.2500           0         1   \n",
       "1  38.0      1      0          PC 17599  71.2833           1         0   \n",
       "2  26.0      0      0  STON/O2. 3101282   7.9250           1         0   \n",
       "3  35.0      1      0            113803  53.1000           1         0   \n",
       "4  35.0      0      0            373450   8.0500           0         1   \n",
       "\n",
       "   Pclass_C  Pclass_Q  Pclass_S  Pclass_U  Pclass_1  Pclass_2  Pclass_3  \n",
       "0         0         0         1         0         0         0         1  \n",
       "1         1         0         0         0         1         0         0  \n",
       "2         0         0         1         0         0         0         1  \n",
       "3         0         0         1         0         1         0         0  \n",
       "4         0         0         1         0         0         0         1  "
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprocessed_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 13.3.3 Binning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "bins = [0, 10, 20, 30, 40, 50, 60, 70, 80]\n",
    "categorized_age = pandas.cut(preprocessed_data['Age'], bins)\n",
    "preprocessed_data['Categorized_age'] = categorized_age\n",
    "preprocessed_data = preprocessed_data.drop([\"Age\"], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Name</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Pclass_C</th>\n",
       "      <th>Pclass_Q</th>\n",
       "      <th>Pclass_S</th>\n",
       "      <th>Pclass_U</th>\n",
       "      <th>Pclass_1</th>\n",
       "      <th>Pclass_2</th>\n",
       "      <th>Pclass_3</th>\n",
       "      <th>Categorized_age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(20, 30]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>(30, 40]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(20, 30]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>(30, 40]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>(30, 40]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived                                               Name  \\\n",
       "0            1         0                            Braund, Mr. Owen Harris   \n",
       "1            2         1  Cumings, Mrs. John Bradley (Florence Briggs Th...   \n",
       "2            3         1                             Heikkinen, Miss. Laina   \n",
       "3            4         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   \n",
       "4            5         0                           Allen, Mr. William Henry   \n",
       "\n",
       "   SibSp  Parch            Ticket     Fare  Sex_female  Sex_male  Pclass_C  \\\n",
       "0      1      0         A/5 21171   7.2500           0         1         0   \n",
       "1      1      0          PC 17599  71.2833           1         0         1   \n",
       "2      0      0  STON/O2. 3101282   7.9250           1         0         0   \n",
       "3      1      0            113803  53.1000           1         0         0   \n",
       "4      0      0            373450   8.0500           0         1         0   \n",
       "\n",
       "   Pclass_Q  Pclass_S  Pclass_U  Pclass_1  Pclass_2  Pclass_3 Categorized_age  \n",
       "0         0         1         0         0         0         1        (20, 30]  \n",
       "1         0         0         0         1         0         0        (30, 40]  \n",
       "2         0         1         0         0         0         1        (20, 30]  \n",
       "3         0         1         0         1         0         0        (30, 40]  \n",
       "4         0         1         0         0         0         1        (30, 40]  "
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprocessed_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "cagegorized_age_columns = pandas.get_dummies(preprocessed_data['Categorized_age'], prefix='Categorized_age')\n",
    "preprocessed_data = pandas.concat([preprocessed_data, cagegorized_age_columns], axis=1)\n",
    "preprocessed_data = preprocessed_data.drop(['Categorized_age'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Name</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Pclass_C</th>\n",
       "      <th>...</th>\n",
       "      <th>Pclass_2</th>\n",
       "      <th>Pclass_3</th>\n",
       "      <th>Categorized_age_(0, 10]</th>\n",
       "      <th>Categorized_age_(10, 20]</th>\n",
       "      <th>Categorized_age_(20, 30]</th>\n",
       "      <th>Categorized_age_(30, 40]</th>\n",
       "      <th>Categorized_age_(40, 50]</th>\n",
       "      <th>Categorized_age_(50, 60]</th>\n",
       "      <th>Categorized_age_(60, 70]</th>\n",
       "      <th>Categorized_age_(70, 80]</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived                                               Name  \\\n",
       "0            1         0                            Braund, Mr. Owen Harris   \n",
       "1            2         1  Cumings, Mrs. John Bradley (Florence Briggs Th...   \n",
       "2            3         1                             Heikkinen, Miss. Laina   \n",
       "3            4         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   \n",
       "4            5         0                           Allen, Mr. William Henry   \n",
       "\n",
       "   SibSp  Parch            Ticket     Fare  Sex_female  Sex_male  Pclass_C  \\\n",
       "0      1      0         A/5 21171   7.2500           0         1         0   \n",
       "1      1      0          PC 17599  71.2833           1         0         1   \n",
       "2      0      0  STON/O2. 3101282   7.9250           1         0         0   \n",
       "3      1      0            113803  53.1000           1         0         0   \n",
       "4      0      0            373450   8.0500           0         1         0   \n",
       "\n",
       "   ...  Pclass_2  Pclass_3  Categorized_age_(0, 10]  Categorized_age_(10, 20]  \\\n",
       "0  ...         0         1                        0                         0   \n",
       "1  ...         0         0                        0                         0   \n",
       "2  ...         0         1                        0                         0   \n",
       "3  ...         0         0                        0                         0   \n",
       "4  ...         0         1                        0                         0   \n",
       "\n",
       "   Categorized_age_(20, 30]  Categorized_age_(30, 40]  \\\n",
       "0                         1                         0   \n",
       "1                         0                         1   \n",
       "2                         1                         0   \n",
       "3                         0                         1   \n",
       "4                         0                         1   \n",
       "\n",
       "   Categorized_age_(40, 50]  Categorized_age_(50, 60]  \\\n",
       "0                         0                         0   \n",
       "1                         0                         0   \n",
       "2                         0                         0   \n",
       "3                         0                         0   \n",
       "4                         0                         0   \n",
       "\n",
       "   Categorized_age_(60, 70]  Categorized_age_(70, 80]  \n",
       "0                         0                         0  \n",
       "1                         0                         0  \n",
       "2                         0                         0  \n",
       "3                         0                         0  \n",
       "4                         0                         0  \n",
       "\n",
       "[5 rows x 24 columns]"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprocessed_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 13.3.3 Feature selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "preprocessed_data = preprocessed_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Pclass_C</th>\n",
       "      <th>Pclass_Q</th>\n",
       "      <th>Pclass_S</th>\n",
       "      <th>Pclass_U</th>\n",
       "      <th>...</th>\n",
       "      <th>Pclass_2</th>\n",
       "      <th>Pclass_3</th>\n",
       "      <th>Categorized_age_(0, 10]</th>\n",
       "      <th>Categorized_age_(10, 20]</th>\n",
       "      <th>Categorized_age_(20, 30]</th>\n",
       "      <th>Categorized_age_(30, 40]</th>\n",
       "      <th>Categorized_age_(40, 50]</th>\n",
       "      <th>Categorized_age_(50, 60]</th>\n",
       "      <th>Categorized_age_(60, 70]</th>\n",
       "      <th>Categorized_age_(70, 80]</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  SibSp  Parch     Fare  Sex_female  Sex_male  Pclass_C  Pclass_Q  \\\n",
       "0         0      1      0   7.2500           0         1         0         0   \n",
       "1         1      1      0  71.2833           1         0         1         0   \n",
       "2         1      0      0   7.9250           1         0         0         0   \n",
       "3         1      1      0  53.1000           1         0         0         0   \n",
       "4         0      0      0   8.0500           0         1         0         0   \n",
       "\n",
       "   Pclass_S  Pclass_U  ...  Pclass_2  Pclass_3  Categorized_age_(0, 10]  \\\n",
       "0         1         0  ...         0         1                        0   \n",
       "1         0         0  ...         0         0                        0   \n",
       "2         1         0  ...         0         1                        0   \n",
       "3         1         0  ...         0         0                        0   \n",
       "4         1         0  ...         0         1                        0   \n",
       "\n",
       "   Categorized_age_(10, 20]  Categorized_age_(20, 30]  \\\n",
       "0                         0                         1   \n",
       "1                         0                         0   \n",
       "2                         0                         1   \n",
       "3                         0                         0   \n",
       "4                         0                         0   \n",
       "\n",
       "   Categorized_age_(30, 40]  Categorized_age_(40, 50]  \\\n",
       "0                         0                         0   \n",
       "1                         1                         0   \n",
       "2                         0                         0   \n",
       "3                         1                         0   \n",
       "4                         1                         0   \n",
       "\n",
       "   Categorized_age_(50, 60]  Categorized_age_(60, 70]  \\\n",
       "0                         0                         0   \n",
       "1                         0                         0   \n",
       "2                         0                         0   \n",
       "3                         0                         0   \n",
       "4                         0                         0   \n",
       "\n",
       "   Categorized_age_(70, 80]  \n",
       "0                         0  \n",
       "1                         0  \n",
       "2                         0  \n",
       "3                         0  \n",
       "4                         0  \n",
       "\n",
       "[5 rows x 21 columns]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprocessed_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 12.3.5 Saving for future use"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "preprocessed_data.to_csv('preprocessed_titanic_data.csv', index=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 13.4 Training models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Pclass_C</th>\n",
       "      <th>Pclass_Q</th>\n",
       "      <th>Pclass_S</th>\n",
       "      <th>Pclass_U</th>\n",
       "      <th>...</th>\n",
       "      <th>Pclass_2</th>\n",
       "      <th>Pclass_3</th>\n",
       "      <th>Categorized_age_(0, 10]</th>\n",
       "      <th>Categorized_age_(10, 20]</th>\n",
       "      <th>Categorized_age_(20, 30]</th>\n",
       "      <th>Categorized_age_(30, 40]</th>\n",
       "      <th>Categorized_age_(40, 50]</th>\n",
       "      <th>Categorized_age_(50, 60]</th>\n",
       "      <th>Categorized_age_(60, 70]</th>\n",
       "      <th>Categorized_age_(70, 80]</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  SibSp  Parch     Fare  Sex_female  Sex_male  Pclass_C  Pclass_Q  \\\n",
       "0         0      1      0   7.2500           0         1         0         0   \n",
       "1         1      1      0  71.2833           1         0         1         0   \n",
       "2         1      0      0   7.9250           1         0         0         0   \n",
       "3         1      1      0  53.1000           1         0         0         0   \n",
       "4         0      0      0   8.0500           0         1         0         0   \n",
       "\n",
       "   Pclass_S  Pclass_U  ...  Pclass_2  Pclass_3  Categorized_age_(0, 10]  \\\n",
       "0         1         0  ...         0         1                        0   \n",
       "1         0         0  ...         0         0                        0   \n",
       "2         1         0  ...         0         1                        0   \n",
       "3         1         0  ...         0         0                        0   \n",
       "4         1         0  ...         0         1                        0   \n",
       "\n",
       "   Categorized_age_(10, 20]  Categorized_age_(20, 30]  \\\n",
       "0                         0                         1   \n",
       "1                         0                         0   \n",
       "2                         0                         1   \n",
       "3                         0                         0   \n",
       "4                         0                         0   \n",
       "\n",
       "   Categorized_age_(30, 40]  Categorized_age_(40, 50]  \\\n",
       "0                         0                         0   \n",
       "1                         1                         0   \n",
       "2                         0                         0   \n",
       "3                         1                         0   \n",
       "4                         1                         0   \n",
       "\n",
       "   Categorized_age_(50, 60]  Categorized_age_(60, 70]  \\\n",
       "0                         0                         0   \n",
       "1                         0                         0   \n",
       "2                         0                         0   \n",
       "3                         0                         0   \n",
       "4                         0                         0   \n",
       "\n",
       "   Categorized_age_(70, 80]  \n",
       "0                         0  \n",
       "1                         0  \n",
       "2                         0  \n",
       "3                         0  \n",
       "4                         0  \n",
       "\n",
       "[5 rows x 21 columns]"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pandas.read_csv('./preprocessed_titanic_data.csv')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 13.4.1 Features-labels split and train-validation split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = data.drop([\"Survived\"], axis=1)\n",
    "labels = data[\"Survived\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# remark: we fix random_state the end, to make sure we always get the same split\n",
    "features_train, features_validation_test, labels_train, labels_validation_test = train_test_split(\n",
    "    features, labels, test_size=0.4, random_state=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "features_validation, features_test, labels_validation, labels_test = train_test_split(\n",
    "    features_validation_test, labels_validation_test, test_size=0.5, random_state=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "534\n",
      "178\n",
      "179\n",
      "534\n",
      "178\n",
      "179\n"
     ]
    }
   ],
   "source": [
    "print(len(features_train))\n",
    "print(len(features_validation))\n",
    "print(len(features_test))\n",
    "print(len(labels_train))\n",
    "print(len(labels_validation))\n",
    "print(len(labels_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 13.4.2 Training different models on our dataset\n",
    "\n",
    "We'll train four models:\n",
    "- Logistic regression (perceptron)\n",
    "- Decision tree\n",
    "- Naive Bayes\n",
    "- Support vector machine (SVM)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/luisserrano/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
      "\n",
      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
      "Please also refer to the documentation for alternative solver options:\n",
      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
      "  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
       "                   intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
       "                   multi_class='auto', n_jobs=None, penalty='l2',\n",
       "                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n",
       "                   warm_start=False)"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "lr_model = LogisticRegression()\n",
    "lr_model.fit(features_train, labels_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n",
       "                       max_depth=None, max_features=None, max_leaf_nodes=None,\n",
       "                       min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                       min_samples_leaf=1, min_samples_split=2,\n",
       "                       min_weight_fraction_leaf=0.0, presort='deprecated',\n",
       "                       random_state=None, splitter='best')"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "dt_model = DecisionTreeClassifier()\n",
    "dt_model.fit(features_train, labels_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GaussianNB(priors=None, var_smoothing=1e-09)"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.naive_bayes import GaussianNB\n",
    "\n",
    "nb_model = GaussianNB()\n",
    "nb_model.fit(features_train, labels_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,\n",
       "    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',\n",
       "    max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
       "    tol=0.001, verbose=False)"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.svm import SVC\n",
    "\n",
    "svm_model = SVC()\n",
    "svm_model.fit(features_train, labels_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n",
       "                       criterion='gini', max_depth=None, max_features='auto',\n",
       "                       max_leaf_nodes=None, max_samples=None,\n",
       "                       min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                       min_samples_leaf=1, min_samples_split=2,\n",
       "                       min_weight_fraction_leaf=0.0, n_estimators=100,\n",
       "                       n_jobs=None, oob_score=False, random_state=None,\n",
       "                       verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "rf_model = RandomForestClassifier()\n",
    "rf_model.fit(features_train, labels_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,\n",
       "                           learning_rate=0.1, loss='deviance', max_depth=3,\n",
       "                           max_features=None, max_leaf_nodes=None,\n",
       "                           min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                           min_samples_leaf=1, min_samples_split=2,\n",
       "                           min_weight_fraction_leaf=0.0, n_estimators=100,\n",
       "                           n_iter_no_change=None, presort='deprecated',\n",
       "                           random_state=None, subsample=1.0, tol=0.0001,\n",
       "                           validation_fraction=0.1, verbose=0,\n",
       "                           warm_start=False)"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "\n",
    "gb_model = GradientBoostingClassifier()\n",
    "gb_model.fit(features_train, labels_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,\n",
       "                   n_estimators=50, random_state=None)"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "\n",
    "ab_model = AdaBoostClassifier()\n",
    "ab_model.fit(features_train, labels_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 13.4.3 Evaluating the models\n",
    "\n",
    "#### Accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Scores of the models\n",
      "Logistic regression: 0.7696629213483146\n",
      "Decision tree: 0.7865168539325843\n",
      "Naive Bayes: 0.7471910112359551\n",
      "SVM: 0.6797752808988764\n",
      "Random forest: 0.7808988764044944\n",
      "Gradient boosting: 0.8089887640449438\n",
      "AdaBoost: 0.7640449438202247\n"
     ]
    }
   ],
   "source": [
    "print(\"Scores of the models\")\n",
    "print(\"Logistic regression:\", lr_model.score(features_validation, labels_validation))\n",
    "print(\"Decision tree:\", dt_model.score(features_validation, labels_validation))\n",
    "print(\"Naive Bayes:\", nb_model.score(features_validation, labels_validation))\n",
    "print(\"SVM:\", svm_model.score(features_validation, labels_validation))\n",
    "print(\"Random forest:\", rf_model.score(features_validation, labels_validation))\n",
    "print(\"Gradient boosting:\", gb_model.score(features_validation, labels_validation))\n",
    "print(\"AdaBoost:\", ab_model.score(features_validation, labels_validation))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### F1-score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F1-scores of the models:\n",
      "Logistic regression: 0.6870229007633588\n",
      "Decision Tree: 0.7205882352941176\n",
      "Naive Bayes: 0.6808510638297872\n",
      "Support Vector Machine: 0.39999999999999997\n",
      "Random Forest: 0.7153284671532848\n",
      "Gradient boosting: 0.7384615384615385\n",
      "AdaBoost: 0.6865671641791045\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import f1_score\n",
    "\n",
    "print(\"F1-scores of the models:\")\n",
    "\n",
    "lr_predicted_labels = lr_model.predict(features_validation)\n",
    "print(\"Logistic regression:\", f1_score(labels_validation, lr_predicted_labels))\n",
    "\n",
    "dt_predicted_labels = dt_model.predict(features_validation)\n",
    "print(\"Decision Tree:\", f1_score(labels_validation, dt_predicted_labels))\n",
    "\n",
    "nb_predicted_labels = nb_model.predict(features_validation)\n",
    "print(\"Naive Bayes:\", f1_score(labels_validation, nb_predicted_labels))\n",
    "\n",
    "svm_predicted_labels = svm_model.predict(features_validation)\n",
    "print(\"Support Vector Machine:\", f1_score(labels_validation, svm_predicted_labels))\n",
    "\n",
    "rf_predicted_labels = rf_model.predict(features_validation)\n",
    "print(\"Random Forest:\", f1_score(labels_validation, rf_predicted_labels))\n",
    "\n",
    "gb_predicted_labels = gb_model.predict(features_validation)\n",
    "print(\"Gradient boosting:\", f1_score(labels_validation, gb_predicted_labels))\n",
    "\n",
    "ab_predicted_labels = ab_model.predict(features_validation)\n",
    "print(\"AdaBoost:\", f1_score(labels_validation, ab_predicted_labels))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 13.4.4 Testing the model\n",
    "\n",
    "Finding the accuracy and the F1-score of the model in the testing set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8324022346368715"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gb_model.score(features_test, labels_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8026315789473685"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gb_predicted_test_labels = gb_model.predict(features_test)\n",
    "f1_score(labels_test, gb_predicted_test_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 13.5 Grid search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SVM grid search with a radial basis function kernel\n",
      "C=1, gamma=0.1 0.702247191011236\n",
      "C=1, gamma=1 0.6966292134831461\n",
      "C=1, gamma=10 0.6685393258426966\n",
      "C=10, gamma=0.1 0.7247191011235955\n",
      "C=10, gamma=1 0.6910112359550562\n",
      "C=10, gamma=10 0.651685393258427\n"
     ]
    }
   ],
   "source": [
    "# Grid search with an rbf kernel\n",
    "\n",
    "print(\"SVM grid search with a radial basis function kernel\")\n",
    "\n",
    "# rbf, C=1, gamma=0.1\n",
    "svm_1_01 = SVC(kernel='rbf', C=1, gamma=0.1)\n",
    "svm_1_01.fit(features_train, labels_train)\n",
    "print(\"C=1, gamma=0.1\", svm_1_01.score(features_validation, labels_validation))\n",
    "\n",
    "# rbf, C=1, gamma=1\n",
    "svm_1_1 = SVC(kernel='rbf', C=1, gamma=1)\n",
    "svm_1_1.fit(features_train, labels_train)\n",
    "print(\"C=1, gamma=1\", svm_1_1.score(features_validation, labels_validation))\n",
    "\n",
    "# rbf, C=1, gamma=10\n",
    "svm_1_10 = SVC(kernel='rbf', C=1, gamma=10)\n",
    "svm_1_10.fit(features_train, labels_train)\n",
    "print(\"C=1, gamma=10\", svm_1_10.score(features_validation, labels_validation))\n",
    "\n",
    "# rbf, C=10, gamma=0.1\n",
    "svm_10_01 = SVC(kernel='rbf', C=10, gamma=0.1)\n",
    "svm_10_01.fit(features_train, labels_train)\n",
    "print(\"C=10, gamma=0.1\", svm_10_01.score(features_validation, labels_validation))\n",
    "\n",
    "# rbf, C=10, gamma=1\n",
    "svm_10_1 = SVC(kernel='rbf', C=10, gamma=1)\n",
    "svm_10_1.fit(features_train, labels_train)\n",
    "print(\"C=10, gamma=1\", svm_10_1.score(features_validation, labels_validation))\n",
    "\n",
    "# rbf, C=10, gamma=10\n",
    "svm_10_10 = SVC(kernel='rbf', C=10, gamma=10)\n",
    "svm_10_10.fit(features_train, labels_train)\n",
    "print(\"C=10, gamma=10\", svm_10_10.score(features_validation, labels_validation))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7191011235955056"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "svm_parameters = {'kernel': ['rbf'],\n",
    "                  'C': [0.01, 0.1, 1 , 10, 100],\n",
    "                  'gamma': [0.01, 0.1, 1, 10, 100]\n",
    "                }\n",
    "svm = SVC()\n",
    "svm_gs = GridSearchCV(estimator = svm,\n",
    "                      param_grid = svm_parameters)\n",
    "svm_gs.fit(features_train, labels_train)\n",
    "\n",
    "svm_winner = svm_gs.best_estimator_\n",
    "svm_winner\n",
    "\n",
    "svm_winner.score(features_validation, labels_validation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,\n",
       "    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',\n",
       "    max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
       "    tol=0.001, verbose=False)"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "svm_winner"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 13.6 Cross validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'mean_fit_time': array([0.01245098, 0.01030803, 0.00838737, 0.00820799, 0.00729985,\n",
       "        0.00674162, 0.00719352, 0.00772071, 0.00829163, 0.00733361,\n",
       "        0.00742478, 0.00799875, 0.00788393, 0.0083982 , 0.00779495,\n",
       "        0.00884881, 0.00944495, 0.00974836, 0.01011376, 0.00908861,\n",
       "        0.01280479, 0.01528444, 0.0100183 , 0.01088009, 0.00922441]),\n",
       " 'std_fit_time': array([1.88711214e-03, 8.47369147e-04, 3.67374713e-04, 2.82492652e-04,\n",
       "        5.12734005e-04, 9.25256522e-05, 2.51734588e-04, 1.84984057e-04,\n",
       "        2.77642775e-04, 2.09819056e-04, 4.43291643e-04, 3.61642128e-04,\n",
       "        1.16118278e-04, 1.69606935e-04, 1.07591899e-04, 2.96022397e-04,\n",
       "        7.69834372e-04, 7.44023667e-04, 5.49712353e-04, 3.40014004e-04,\n",
       "        1.61032865e-03, 2.80923803e-03, 4.40516155e-04, 5.23633501e-04,\n",
       "        6.33528242e-04]),\n",
       " 'mean_score_time': array([0.0043118 , 0.00286026, 0.00230074, 0.00219541, 0.00186715,\n",
       "        0.00189233, 0.00194263, 0.00219169, 0.00228682, 0.00195198,\n",
       "        0.0018856 , 0.00198255, 0.0020298 , 0.00208001, 0.00185266,\n",
       "        0.00171843, 0.00183668, 0.00196934, 0.00221314, 0.0020112 ,\n",
       "        0.00169306, 0.00190129, 0.00225825, 0.00227146, 0.00200768]),\n",
       " 'std_score_time': array([2.22522651e-03, 3.26620902e-04, 1.09212158e-04, 1.14567845e-04,\n",
       "        1.39034186e-04, 5.49592836e-05, 4.66996211e-05, 9.37714063e-05,\n",
       "        1.65250777e-04, 9.58515516e-05, 1.49046161e-04, 1.28407971e-04,\n",
       "        6.59387691e-05, 1.23563625e-04, 5.43319482e-05, 1.05057799e-04,\n",
       "        3.56556878e-05, 8.78995984e-05, 2.38814626e-04, 9.98350122e-05,\n",
       "        8.58271651e-05, 9.56638736e-05, 3.94875942e-04, 1.51884679e-04,\n",
       "        4.93021490e-05]),\n",
       " 'param_C': masked_array(data=[0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1,\n",
       "                    1, 1, 1, 1, 1, 10, 10, 10, 10, 10, 100, 100, 100, 100,\n",
       "                    100],\n",
       "              mask=[False, False, False, False, False, False, False, False,\n",
       "                    False, False, False, False, False, False, False, False,\n",
       "                    False, False, False, False, False, False, False, False,\n",
       "                    False],\n",
       "        fill_value='?',\n",
       "             dtype=object),\n",
       " 'param_gamma': masked_array(data=[0.01, 0.1, 1, 10, 100, 0.01, 0.1, 1, 10, 100, 0.01,\n",
       "                    0.1, 1, 10, 100, 0.01, 0.1, 1, 10, 100, 0.01, 0.1, 1,\n",
       "                    10, 100],\n",
       "              mask=[False, False, False, False, False, False, False, False,\n",
       "                    False, False, False, False, False, False, False, False,\n",
       "                    False, False, False, False, False, False, False, False,\n",
       "                    False],\n",
       "        fill_value='?',\n",
       "             dtype=object),\n",
       " 'param_kernel': masked_array(data=['rbf', 'rbf', 'rbf', 'rbf', 'rbf', 'rbf', 'rbf', 'rbf',\n",
       "                    'rbf', 'rbf', 'rbf', 'rbf', 'rbf', 'rbf', 'rbf', 'rbf',\n",
       "                    'rbf', 'rbf', 'rbf', 'rbf', 'rbf', 'rbf', 'rbf', 'rbf',\n",
       "                    'rbf'],\n",
       "              mask=[False, False, False, False, False, False, False, False,\n",
       "                    False, False, False, False, False, False, False, False,\n",
       "                    False, False, False, False, False, False, False, False,\n",
       "                    False],\n",
       "        fill_value='?',\n",
       "             dtype=object),\n",
       " 'params': [{'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'},\n",
       "  {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'},\n",
       "  {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'},\n",
       "  {'C': 0.01, 'gamma': 10, 'kernel': 'rbf'},\n",
       "  {'C': 0.01, 'gamma': 100, 'kernel': 'rbf'},\n",
       "  {'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'},\n",
       "  {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'},\n",
       "  {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'},\n",
       "  {'C': 0.1, 'gamma': 10, 'kernel': 'rbf'},\n",
       "  {'C': 0.1, 'gamma': 100, 'kernel': 'rbf'},\n",
       "  {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'},\n",
       "  {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'},\n",
       "  {'C': 1, 'gamma': 1, 'kernel': 'rbf'},\n",
       "  {'C': 1, 'gamma': 10, 'kernel': 'rbf'},\n",
       "  {'C': 1, 'gamma': 100, 'kernel': 'rbf'},\n",
       "  {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'},\n",
       "  {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'},\n",
       "  {'C': 10, 'gamma': 1, 'kernel': 'rbf'},\n",
       "  {'C': 10, 'gamma': 10, 'kernel': 'rbf'},\n",
       "  {'C': 10, 'gamma': 100, 'kernel': 'rbf'},\n",
       "  {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'},\n",
       "  {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'},\n",
       "  {'C': 100, 'gamma': 1, 'kernel': 'rbf'},\n",
       "  {'C': 100, 'gamma': 10, 'kernel': 'rbf'},\n",
       "  {'C': 100, 'gamma': 100, 'kernel': 'rbf'}],\n",
       " 'split0_test_score': array([0.64485981, 0.64485981, 0.64485981, 0.64485981, 0.64485981,\n",
       "        0.65420561, 0.64485981, 0.64485981, 0.64485981, 0.64485981,\n",
       "        0.71962617, 0.73831776, 0.6728972 , 0.70093458, 0.70093458,\n",
       "        0.82242991, 0.71962617, 0.6728972 , 0.70093458, 0.70093458,\n",
       "        0.80373832, 0.74766355, 0.6635514 , 0.70093458, 0.70093458]),\n",
       " 'split1_test_score': array([0.64485981, 0.64485981, 0.64485981, 0.64485981, 0.64485981,\n",
       "        0.65420561, 0.64485981, 0.64485981, 0.64485981, 0.64485981,\n",
       "        0.64485981, 0.71028037, 0.68224299, 0.6635514 , 0.6635514 ,\n",
       "        0.73831776, 0.73831776, 0.6728972 , 0.65420561, 0.6635514 ,\n",
       "        0.76635514, 0.74766355, 0.64485981, 0.65420561, 0.6635514 ]),\n",
       " 'split2_test_score': array([0.64485981, 0.64485981, 0.64485981, 0.64485981, 0.64485981,\n",
       "        0.6728972 , 0.64485981, 0.64485981, 0.64485981, 0.64485981,\n",
       "        0.71962617, 0.79439252, 0.73831776, 0.70093458, 0.71028037,\n",
       "        0.81308411, 0.79439252, 0.75700935, 0.70093458, 0.71028037,\n",
       "        0.82242991, 0.74766355, 0.73831776, 0.71028037, 0.71028037]),\n",
       " 'split3_test_score': array([0.64485981, 0.64485981, 0.64485981, 0.64485981, 0.64485981,\n",
       "        0.64485981, 0.64485981, 0.64485981, 0.64485981, 0.64485981,\n",
       "        0.70093458, 0.77570093, 0.74766355, 0.70093458, 0.70093458,\n",
       "        0.81308411, 0.75700935, 0.73831776, 0.70093458, 0.70093458,\n",
       "        0.80373832, 0.74766355, 0.74766355, 0.70093458, 0.70093458]),\n",
       " 'split4_test_score': array([0.6509434 , 0.6509434 , 0.6509434 , 0.6509434 , 0.6509434 ,\n",
       "        0.66037736, 0.6509434 , 0.6509434 , 0.6509434 , 0.6509434 ,\n",
       "        0.70754717, 0.80188679, 0.77358491, 0.69811321, 0.70754717,\n",
       "        0.8490566 , 0.76415094, 0.77358491, 0.70754717, 0.69811321,\n",
       "        0.82075472, 0.75471698, 0.77358491, 0.70754717, 0.69811321]),\n",
       " 'mean_test_score': array([0.64607653, 0.64607653, 0.64607653, 0.64607653, 0.64607653,\n",
       "        0.65730912, 0.64607653, 0.64607653, 0.64607653, 0.64607653,\n",
       "        0.69851878, 0.76411568, 0.72294128, 0.69289367, 0.69664962,\n",
       "        0.8071945 , 0.75469935, 0.72294128, 0.6929113 , 0.69476283,\n",
       "        0.80340328, 0.74907424, 0.71359549, 0.69478046, 0.69476283]),\n",
       " 'std_test_score': array([0.00243343, 0.00243343, 0.00243343, 0.00243343, 0.00243343,\n",
       "        0.00923744, 0.00243343, 0.00243343, 0.00243343, 0.00243343,\n",
       "        0.02777652, 0.03476014, 0.03891829, 0.01471177, 0.01695151,\n",
       "        0.03687258, 0.0251742 , 0.04235721, 0.01952157, 0.0161391 ,\n",
       "        0.02017857, 0.00282137, 0.05019872, 0.02061699, 0.0161391 ]),\n",
       " 'rank_test_score': array([17, 17, 17, 17, 17, 16, 17, 17, 17, 17,  9,  3,  6, 15, 10,  1,  4,\n",
       "         6, 14, 12,  2,  5,  8, 11, 12], dtype=int32)}"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "svm_gs.cv_results_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exercise 13.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data = pandas.read_csv('test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PassengerId      0\n",
       "Pclass           0\n",
       "Name             0\n",
       "Sex              0\n",
       "Age             86\n",
       "SibSp            0\n",
       "Parch            0\n",
       "Ticket           0\n",
       "Fare             1\n",
       "Cabin          327\n",
       "Embarked         0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_data.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "      <th>Embarked_U</th>\n",
       "      <th>Pclass_1</th>\n",
       "      <th>Pclass_2</th>\n",
       "      <th>Pclass_3</th>\n",
       "      <th>Categorized_age_(0, 10]</th>\n",
       "      <th>Categorized_age_(10, 20]</th>\n",
       "      <th>Categorized_age_(20, 30]</th>\n",
       "      <th>Categorized_age_(30, 40]</th>\n",
       "      <th>Categorized_age_(40, 50]</th>\n",
       "      <th>Categorized_age_(50, 60]</th>\n",
       "      <th>Categorized_age_(60, 70]</th>\n",
       "      <th>Categorized_age_(70, 80]</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.8292</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.0000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9.6875</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.6625</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>12.2875</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>413</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>414</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>108.9000</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>415</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>416</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>417</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>22.3583</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>418 rows × 20 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     SibSp  Parch      Fare  Sex_female  Sex_male  Embarked_C  Embarked_Q  \\\n",
       "0        0      0    7.8292           0         1           0           1   \n",
       "1        1      0    7.0000           1         0           0           0   \n",
       "2        0      0    9.6875           0         1           0           1   \n",
       "3        0      0    8.6625           0         1           0           0   \n",
       "4        1      1   12.2875           1         0           0           0   \n",
       "..     ...    ...       ...         ...       ...         ...         ...   \n",
       "413      0      0    8.0500           0         1           0           0   \n",
       "414      0      0  108.9000           1         0           1           0   \n",
       "415      0      0    7.2500           0         1           0           0   \n",
       "416      0      0    8.0500           0         1           0           0   \n",
       "417      1      1   22.3583           0         1           1           0   \n",
       "\n",
       "     Embarked_S  Embarked_U  Pclass_1  Pclass_2  Pclass_3  \\\n",
       "0             0           0         0         0         1   \n",
       "1             1           0         0         0         1   \n",
       "2             0           0         0         1         0   \n",
       "3             1           0         0         0         1   \n",
       "4             1           0         0         0         1   \n",
       "..          ...         ...       ...       ...       ...   \n",
       "413           1           0         0         0         1   \n",
       "414           0           0         1         0         0   \n",
       "415           1           0         0         0         1   \n",
       "416           1           0         0         0         1   \n",
       "417           0           0         0         0         1   \n",
       "\n",
       "     Categorized_age_(0, 10]  Categorized_age_(10, 20]  \\\n",
       "0                          0                         0   \n",
       "1                          0                         0   \n",
       "2                          0                         0   \n",
       "3                          0                         0   \n",
       "4                          0                         0   \n",
       "..                       ...                       ...   \n",
       "413                        0                         0   \n",
       "414                        0                         0   \n",
       "415                        0                         0   \n",
       "416                        0                         0   \n",
       "417                        0                         1   \n",
       "\n",
       "     Categorized_age_(20, 30]  Categorized_age_(30, 40]  \\\n",
       "0                           1                         0   \n",
       "1                           0                         1   \n",
       "2                           1                         0   \n",
       "3                           0                         1   \n",
       "4                           0                         1   \n",
       "..                        ...                       ...   \n",
       "413                         1                         0   \n",
       "414                         0                         0   \n",
       "415                         1                         0   \n",
       "416                         0                         1   \n",
       "417                         0                         0   \n",
       "\n",
       "     Categorized_age_(40, 50]  Categorized_age_(50, 60]  \\\n",
       "0                           0                         0   \n",
       "1                           0                         0   \n",
       "2                           0                         0   \n",
       "3                           0                         0   \n",
       "4                           0                         0   \n",
       "..                        ...                       ...   \n",
       "413                         0                         0   \n",
       "414                         1                         0   \n",
       "415                         0                         0   \n",
       "416                         0                         0   \n",
       "417                         0                         0   \n",
       "\n",
       "     Categorized_age_(60, 70]  Categorized_age_(70, 80]  \n",
       "0                           0                         0  \n",
       "1                           0                         0  \n",
       "2                           0                         0  \n",
       "3                           0                         0  \n",
       "4                           0                         0  \n",
       "..                        ...                       ...  \n",
       "413                         0                         0  \n",
       "414                         0                         0  \n",
       "415                         0                         0  \n",
       "416                         0                         0  \n",
       "417                         0                         0  \n",
       "\n",
       "[418 rows x 20 columns]"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cleaning the data\n",
    "test_data = test_data.drop('Cabin', axis=1)\n",
    "test_data[\"Age\"] = test_data[\"Age\"].fillna(28.0)\n",
    "\n",
    "# Catch! The test data has one missing fare. Let's fix that\n",
    "average_fare = test_data[\"Fare\"].mean()\n",
    "test_data['Fare'] = test_data['Fare'].fillna(average_fare)\n",
    "\n",
    "# Preprocessing the data\n",
    "test_gender_columns = pandas.get_dummies(test_data['Sex'], prefix='Sex')\n",
    "test_embarked_columns = pandas.get_dummies(test_data[\"Embarked\"], prefix=\"Embarked\")\n",
    "test_data = pandas.concat([test_data, test_gender_columns], axis=1)\n",
    "test_data = pandas.concat([test_data, test_embarked_columns], axis=1)\n",
    "test_data = test_data.drop(['Sex', 'Embarked'], axis=1)\n",
    "\n",
    "# Another small catch, the test data has no missing 'Embarked' fields. Therefore, the processed test data will not\n",
    "# have an 'Embarked_Q' column. We need to artificially add one filled with zeros.\n",
    "test_data['Embarked_U'] = pandas.DataFrame([0 for i in range(len(test_data))])\n",
    "\n",
    "test_categorized_pclass_columns = pandas.get_dummies(test_data['Pclass'], prefix='Pclass')\n",
    "test_data = pandas.concat([test_data, test_categorized_pclass_columns], axis=1)\n",
    "test_data = test_data.drop(['Pclass'], axis=1)\n",
    "\n",
    "bins = [0, 10, 20, 30, 40, 50, 60, 70, 80]\n",
    "test_categorized_age = pandas.cut(test_data['Age'], bins)\n",
    "test_data['Categorized_age'] = categorized_age\n",
    "test_data = test_data.drop([\"Age\"], axis=1)\n",
    "\n",
    "test_cagegorized_age_columns = pandas.get_dummies(test_data['Categorized_age'], prefix='Categorized_age')\n",
    "test_data = pandas.concat([test_data, test_cagegorized_age_columns], axis=1)\n",
    "test_data = test_data.drop(['Categorized_age'], axis=1)\n",
    "\n",
    "test_data = test_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1)\n",
    "test_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, to check how many survivors were predicted by each model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "153"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Logistic regression\n",
    "sum(lr_model.predict(test_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "163"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Decision tree\n",
    "sum(dt_model.predict(test_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "195"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Naive Bayes\n",
    "sum(nb_model.predict(test_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "61"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Support vector machine\n",
    "sum(svm_model.predict(test_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "154"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Random forest\n",
    "sum(rf_model.predict(test_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "156"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Gradient boosting\n",
    "sum(gb_model.predict(test_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "155"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# AdaBoost\n",
    "sum(ab_model.predict(test_data))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Since the three strongest models in terms of accuracy were random forests, gradient boosting, and adaboost, and they predicted that 154, 156, and 155 passengers survived out of the 418 in the test set, a good estimate for the number of survivors is the average of these three predictions, or 155."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
